library(aplore3)
library(ggplot2)
library(dplyr)
library(plotly)
library(GGally)
library(ca)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(ggcorrplot)
library(dplyr)
library(tidyr)
library(sjPlot)
library(sjmisc)
library(caret)
library(MASS)
library(randomForest)
library(vcd)
library(FactoMineR)
library(pROC)
#??glow500

Data Format: A data.frame with 500 rows and 18 variables such as:

priorfrac - If the patient previously had a fracture
age - Age at Enrollment (years)
weight - Weight at enrollment (Kilograms)
height - Height at enrollment (Centimeters)
bmi - Body Mass Index (Kg/m^2)
premeno - Menopause before age 45 (1: No, 2: Yes)
momfrac - Mother had hip fracture (1: No, 2: Yes)
armassist - Arms are needed to stand from a chair (1: No, 2: Yes)
smoke - Former or current smoker (1: No, 2: Yes)
raterisk - Self-reported risk of fracture (1: Less than others of the same age, 2: Same as others of the same age, 3: Greater than others of the same age
fracscore - Fracture Risk Score (Composite Risk Score)
fracture - Any fracture in first year (1: No, 2: Yes)
bonemed - Bone medications at enrollment (1: No, 2: Yes)
bonemed_fu - Bone medications at follow-up (1: No, 2: Yes)
bonetreat - Bone medications both at enrollment and follow-up (1: No, 2: Yes)

rm(glow_bonemed)
## Warning in rm(glow_bonemed): object 'glow_bonemed' not found
data("glow_bonemed", package = "aplore3")
head(glow_bonemed)
##   sub_id site_id phy_id priorfrac age weight height      bmi premeno momfrac
## 1      1       1     14        No  62   70.3    158 28.16055      No      No
## 2      2       4    284        No  65   87.1    160 34.02344      No      No
## 3      3       6    305       Yes  88   50.8    157 20.60936      No     Yes
## 4      4       6    309        No  82   62.1    160 24.25781      No      No
## 5      5       1     37        No  61   68.0    152 29.43213      No      No
## 6      6       5    299       Yes  67   68.0    161 26.23356      No      No
##   armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1        No    No     Same         1       No      No         No        No
## 2        No    No     Same         2       No      No         No        No
## 3       Yes    No     Less        11       No      No         No        No
## 4        No    No     Less         5       No      No         No        No
## 5        No    No     Same         1       No      No         No        No
## 6        No   Yes     Same         4       No      No         No        No

Summary statistics for numeric variables

#Set fracscore to integer for summary statistics
glow_bonemed$fracscore = as.integer(glow_bonemed$fracscore)
mysummary = glow_bonemed %>%
  dplyr::select(age, weight, height, bmi, fracscore) %>%
  summarise_each(
    funs(min = min, 
    q25 = quantile(., 0.25), 
    median = median, 
    q75 = quantile(., 0.75), 
    max = max,
    mean = mean, 
    sd = sd,
    variance= var))

# reshape it using tidyr functions

clean.summary = mysummary %>% 
  gather(stat, val) %>%
  separate(stat, into = c("var", "stat"), sep = "_") %>%
  spread(stat, val) %>%
  dplyr::select(var, min, max, mean, sd, variance)

print(clean.summary)
##         var       min       max      mean        sd   variance
## 1       age  55.00000  90.00000  68.56200  8.989537  80.811780
## 2       bmi  14.87637  49.08241  27.55303  5.973958  35.688178
## 3 fracscore   0.00000  11.00000   3.69800  2.495446   6.227251
## 4    height 134.00000 199.00000 161.36400  6.355493  40.392289
## 5    weight  39.90000 127.00000  71.82320 16.435992 270.141825

Summary statistics for categorical variables

summary(glow_bonemed %>% dplyr::select(priorfrac, premeno, momfrac, armassist, smoke, raterisk, fracture, bonemed, bonemed_fu, bonetreat))
##  priorfrac premeno   momfrac   armassist smoke        raterisk   fracture 
##  No :374   No :403   No :435   No :312   No :465   Less   :167   No :375  
##  Yes:126   Yes: 97   Yes: 65   Yes:188   Yes: 35   Same   :186   Yes:125  
##                                                    Greater:147            
##  bonemed   bonemed_fu bonetreat
##  No :371   No :361    No :382  
##  Yes:129   Yes:139    Yes:118  
## 

No missing values

colSums(is.na(glow_bonemed))
##     sub_id    site_id     phy_id  priorfrac        age     weight     height 
##          0          0          0          0          0          0          0 
##        bmi    premeno    momfrac  armassist      smoke   raterisk  fracscore 
##          0          0          0          0          0          0          0 
##   fracture    bonemed bonemed_fu  bonetreat 
##          0          0          0          0
sum(is.na(glow_bonemed))
## [1] 0

Age vs Weight: As weight increases the average age decreases
Age vs Height: Weak correlation of as height increases age decreases
Age vs BMI: As bmi increases the average age decreases
Age vs fracscore: As age increases the average fracscore increases

Weight vs Height: As height increases the average weight increases
Weight vs BMI: As bmi increases the average weight increases
Weight vs fracscore: As fracscore increases the average Weight decreases

Height vs BMI: As bmi increases the average height and variance stay the same
Height vs fracscore: As fracscore increases the average height stays the same though variance might decrease

BMI vs fracscore: As fracscore increases the average bmi decreases

plot(glow_bonemed[, c(5:8, 14)])

ggplotly(ggpairs(glow_bonemed[, c(5:8, 14)], ggplot2::aes(color = glow_bonemed$fracture), lower = list(continuous = wrap("smooth", alpha = 0.5, size = 0.9))))
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Can only have one: highlight
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Can only have one: highlight
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Can only have one: highlight
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Can only have one: highlight

Non of the following scatter plots show strong groupings for the fracture/no fracture categorical variable

ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
  geom_jitter() +
  labs(title = "BMI vs Age")

ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
  geom_jitter() +
  labs(title = "Fracture Score vs BMI")

ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
  geom_jitter() +
  labs(title = "Age vs Fracture Score")

ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
  geom_jitter() +
  labs(title = "Height vs Weight")

Once again there doesn’t seem to be strong groupings of the fracture categorical variable

fracture3dplot = plot_ly(glow_bonemed, 
  x = ~age, 
  y = ~height, 
  z = ~bmi, 
  color = ~fracture, 
  colors = c('#0C4B8E', '#BF382A')) %>% add_markers()

fracture3dplot

There are so little “yes” fracture results that the plot isn’t very useful

## `geom_smooth()` using formula = 'y ~ x'

The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers

ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
  geom_boxplot() +
  labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")

ggplot(glow_bonemed, aes(x = fracture, y = fracscore)) +
  geom_boxplot() +
  labs(title = "Fracture score distribution for fracture groups")

ggplot(glow_bonemed, aes(x = fracscore, y = bonetreat, color = fracture)) + 
  geom_point(position = position_jitter(width = 0.2, height = 0.1)) +
  labs(x = "Fracture Score", y = "Recieved bone medication treatment at enrollment and at follow up", color = "Fracture") +
  ggtitle("Difference in Fracture Score vs bonetreatement at enrollment and follow up")

Plot confirms there is a strong correlation between age/fracscore, bmi/weight

corr_vars <- c("age", "weight", "height", "bmi", "fracscore")

corr_df <- glow_bonemed[, corr_vars]
corr_df <- cor(corr_df)

ggcorrplot(corr = corr_df, lab = TRUE, lab_size = 2,
  colors = c("#6D9EC1", "white", "#E46726")) +
  labs(title = "Correlation Between Variables") +
  theme(plot.title = element_text(hjust = .5),
  plot.subtitle = element_text(hjust = .5))

Categorical Variable EDA

par(mfrow = c(2,4))

mosaicplot(table(glow_bonemed$bonetreat, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "bonetreat",
           ylab = "Fracture",
           main = "Bone Treatement vs Fracture")
mosaicplot(table(glow_bonemed$priorfrac, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "priorfrac",
           ylab = "Fracture",
           main = "Prior Fracture vs Fracture")
mosaicplot(table(glow_bonemed$bonemed, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "bonemed",
           ylab = "Fracture",
           main = "Bonemed vs Fracture")
mosaicplot(table(glow_bonemed$bonemed_fu, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "bonemed_fu",
           ylab = "Fracture",
           main = "Bonemed_fu vs Fracture")
mosaicplot(table(glow_bonemed$smoke, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "smoke",
           ylab = "Fracture",
           main = "Smoke vs Fracture")
mosaicplot(table(glow_bonemed$armassist, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "armassist",
           ylab = "Fracture",
           main = "Armassist vs Fracture")
mosaicplot(table(glow_bonemed$momfrac, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "momfrac",
           ylab = "Fracture",
           main = "Mother Fracture vs Fracture")
mosaicplot(table(glow_bonemed$premeno, glow_bonemed$fracture), 
           color = TRUE,
           xlab = "premeno",
           ylab = "Fracture",
           main = "Premeno Treatement vs Fracture")

chisq.test(table(glow_bonemed$priorfrac, glow_bonemed$fracture))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(glow_bonemed$priorfrac, glow_bonemed$fracture)
## X-squared = 22.635, df = 1, p-value = 1.959e-06
chisq.test(table(glow_bonemed$bonemed, glow_bonemed$fracture))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(glow_bonemed$bonemed, glow_bonemed$fracture)
## X-squared = 9.7822, df = 1, p-value = 0.001762
chisq.test(table(glow_bonemed$bonemed_fu, glow_bonemed$fracture))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(glow_bonemed$bonemed_fu, glow_bonemed$fracture)
## X-squared = 16.743, df = 1, p-value = 4.279e-05
chisq.test(table(glow_bonemed$bonetreat, glow_bonemed$fracture))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(glow_bonemed$bonetreat, glow_bonemed$fracture)
## X-squared = 5.9159, df = 1, p-value = 0.015

Perform chi-squared test on all categorical variables

glow_bonemed$fracscore = as.factor(glow_bonemed$fracscore)

blow_bonemed_categoricals = glow_bonemed[, c(4, 9:18)]

multipleCorrespondenceAnalysis = MCA(blow_bonemed_categoricals, quali.sup = "fracture", graph = FALSE)
multipleCorrespondenceAnalysis
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 500 individuals, described by 11 variables
## *The results are available in the following objects:
## 
##    name                description                                          
## 1  "$eig"              "eigenvalues"                                        
## 2  "$var"              "results for the variables"                          
## 3  "$var$coord"        "coord. of the categories"                           
## 4  "$var$cos2"         "cos2 for the categories"                            
## 5  "$var$contrib"      "contributions of the categories"                    
## 6  "$var$v.test"       "v-test for the categories"                          
## 7  "$ind"              "results for the individuals"                        
## 8  "$ind$coord"        "coord. for the individuals"                         
## 9  "$ind$cos2"         "cos2 for the individuals"                           
## 10 "$ind$contrib"      "contributions of the individuals"                   
## 11 "$quali.sup"        "results for the supplementary categorical variables"
## 12 "$quali.sup$coord"  "coord. for the supplementary categories"            
## 13 "$quali.sup$cos2"   "cos2 for the supplementary categories"              
## 14 "$quali.sup$v.test" "v-test for the supplementary categories"            
## 15 "$call"             "intermediate results"                               
## 16 "$call$marge.col"   "weights of columns"                                 
## 17 "$call$marge.li"    "weights of rows"

All of the categorical variables seem to have significant P values with bonetreate, bonemed, and bonemed_fu all having R^2 above .83

firstDimension = dimdesc(multipleCorrespondenceAnalysis, axes = 1)
firstDimension
## $`Dim 1`
## 
## Link between the variable and the categorical variable (1-way anova)
## =============================================
##                    R2       p.value
## bonetreat  0.87245886 7.754159e-225
## bonemed    0.84726742 2.443029e-205
## bonemed_fu 0.83803940 5.422994e-199
## raterisk   0.19571783  3.118876e-24
## fracscore  0.19131230  2.223696e-17
## priorfrac  0.10838142  4.201009e-14
## fracture   0.04922945  5.400633e-07
## armassist  0.02280481  7.047091e-04
## premeno    0.01718799  3.315107e-03
## smoke      0.01441500  7.194873e-03
## 
## Link between variable and the categories of the categorical variables
## ================================================================
##                              Estimate       p.value
## bonetreat=bonetreat_Yes    0.61370510 7.754159e-225
## bonemed=bonemed_Yes        0.58693281 2.443029e-205
## bonemed_fu=bonemed_fu_Yes  0.57007390 5.422994e-199
## raterisk=Greater           0.32172629  2.159398e-19
## priorfrac=priorfrac_Yes    0.21155159  4.201009e-14
## fracscore=10               1.42877661  4.142645e-07
## fracture=fracture_Yes      0.14295580  5.400633e-07
## fracscore=9                0.47787785  1.527510e-04
## fracscore=8                0.21858281  3.120054e-04
## armassist=armassist_Yes    0.08697950  7.047091e-04
## fracscore=7                0.09452369  8.158244e-04
## premeno=premeno_No         0.09249838  3.315107e-03
## smoke=smoke_No             0.13128246  7.194873e-03
## fracscore=5               -0.05350173  4.364518e-02
## smoke=smoke_Yes           -0.13128246  7.194873e-03
## premeno=premeno_Yes       -0.09249838  3.315107e-03
## fracscore=1               -0.40626672  1.010225e-03
## armassist=armassist_No    -0.08697950  7.047091e-04
## fracscore=0               -0.49480779  3.625746e-05
## fracture=fracture_No      -0.14295580  5.400633e-07
## priorfrac=priorfrac_No    -0.21155159  4.201009e-14
## raterisk=Less             -0.30243929  2.386909e-17
## bonemed_fu=bonemed_fu_No  -0.57007390 5.422994e-199
## bonemed=bonemed_No        -0.58693281 2.443029e-205
## bonetreat=bonetreat_No    -0.61370510 7.754159e-225

bonetreat, bonemed, bonemed_fu all have a sensitivity around 78% and specificity between 32% and 42%

confusionMatrix(table(glow_bonemed$bonetreat, glow_bonemed$fracture)) 
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  297  85
##   Yes  78  40
##                                         
##                Accuracy : 0.674         
##                  95% CI : (0.631, 0.715)
##     No Information Rate : 0.75          
##     P-Value [Acc > NIR] : 0.9999        
##                                         
##                   Kappa : 0.1141        
##                                         
##  Mcnemar's Test P-Value : 0.6384        
##                                         
##             Sensitivity : 0.7920        
##             Specificity : 0.3200        
##          Pos Pred Value : 0.7775        
##          Neg Pred Value : 0.3390        
##              Prevalence : 0.7500        
##          Detection Rate : 0.5940        
##    Detection Prevalence : 0.7640        
##       Balanced Accuracy : 0.5560        
##                                         
##        'Positive' Class : No            
## 
confusionMatrix(table(glow_bonemed$bonemed, glow_bonemed$fracture))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  292  79
##   Yes  83  46
##                                          
##                Accuracy : 0.676          
##                  95% CI : (0.633, 0.7169)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : 0.9999         
##                                          
##                   Kappa : 0.1451         
##                                          
##  Mcnemar's Test P-Value : 0.8137         
##                                          
##             Sensitivity : 0.7787         
##             Specificity : 0.3680         
##          Pos Pred Value : 0.7871         
##          Neg Pred Value : 0.3566         
##              Prevalence : 0.7500         
##          Detection Rate : 0.5840         
##    Detection Prevalence : 0.7420         
##       Balanced Accuracy : 0.5733         
##                                          
##        'Positive' Class : No             
## 
confusionMatrix(table(glow_bonemed$bonemed_fu, glow_bonemed$fracture))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  289  72
##   Yes  86  53
##                                           
##                Accuracy : 0.684           
##                  95% CI : (0.6412, 0.7246)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.9996          
##                                           
##                   Kappa : 0.1877          
##                                           
##  Mcnemar's Test P-Value : 0.3010          
##                                           
##             Sensitivity : 0.7707          
##             Specificity : 0.4240          
##          Pos Pred Value : 0.8006          
##          Neg Pred Value : 0.3813          
##              Prevalence : 0.7500          
##          Detection Rate : 0.5780          
##    Detection Prevalence : 0.7220          
##       Balanced Accuracy : 0.5973          
##                                           
##        'Positive' Class : No              
## 
glow_bonemed$fracscore = as.factor(glow_bonemed$fracscore)

blow_bonemed_categoricals = glow_bonemed[, c(15:18)]

multipleCorrespondenceAnalysis = MCA(blow_bonemed_categoricals, graph = FALSE)

plot.MCA(multipleCorrespondenceAnalysis, 
         cex = 0.8, 
         col.quali.sup = c("red", "blue", "green"), 
         title = "Multiple Correspondence Analysis")

Clustering EDA

pheatmap(glow_bonemed[, c(5,8)], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

zScoreScale = scale(glow_bonemed[, 5:8])

zScoreDistance = dist(zScoreScale)

continuousVariableClustering = hclust(zScoreDistance, method = "complete")

plot(continuousVariableClustering)

Modeling

Split the data into a training/testing set

set.seed(4)

trainingIndices = sample(c(1:dim(glow_bonemed)[1]), dim(glow_bonemed)[1]*0.8)

trainingDataframe = glow_bonemed[trainingIndices,]
testingDataframe = glow_bonemed[-trainingIndices,]

Age is the only statistically significant continuous variable at the alpha = 0.2 level (p < 0.0001)

model = glm(fracture ~ age + weight + height + bmi, data = glow_bonemed[trainingIndices,], family = "binomial")

summary(model)
## 
## Call:
## glm(formula = fracture ~ age + weight + height + bmi, family = "binomial", 
##     data = glow_bonemed[trainingIndices, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2770  -0.7951  -0.6672   1.2602   1.9710  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.006590  13.891104  -0.432 0.665447    
## age          0.045888   0.013481   3.404 0.000664 ***
## weight      -0.039155   0.095610  -0.410 0.682156    
## height       0.008938   0.085578   0.104 0.916814    
## bmi          0.113632   0.247382   0.459 0.645993    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 458.45  on 399  degrees of freedom
## Residual deviance: 442.53  on 395  degrees of freedom
## AIC: 452.53
## 
## Number of Fisher Scoring iterations: 4
AIC(model)
## [1] 452.5326

bonetreat 0.87245886 7.754159e-225 bonemed 0.84726742 2.443029e-205 bonemed_fu 0.83803940 5.422994e-199 raterisk 0.19571783 3.118876e-24 fracscore 0.19131230 2.223696e-17 priorfrac 0.10838142 4.201009e-14 fracture 0.04922945 5.400633e-07 armassist 0.02280481 7.047091e-04 premeno 0.01718799 3.315107e-03 smoke 0.01441500 7.194873e-03

model = glm(fracture ~ age + bonetreat + bonemed + bonemed_fu , data = glow_bonemed[trainingIndices,], family = "binomial")

summary(model)
## 
## Call:
## glm(formula = fracture ~ age + bonetreat + bonemed + bonemed_fu, 
##     family = "binomial", data = glow_bonemed[trainingIndices, 
##         ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4723  -0.7634  -0.6110   0.8596   1.9992  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -4.00531    0.93027  -4.306 1.67e-05 ***
## age            0.03844    0.01326   2.898  0.00376 ** 
## bonetreatYes  -2.50872    0.88413  -2.838  0.00455 ** 
## bonemedYes     1.39128    0.70199   1.982  0.04749 *  
## bonemed_fuYes  1.71650    0.51300   3.346  0.00082 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 458.45  on 399  degrees of freedom
## Residual deviance: 428.21  on 395  degrees of freedom
## AIC: 438.21
## 
## Number of Fisher Scoring iterations: 4
AIC(model)
## [1] 438.2137
# trying to figure out how to use sjPlot to mimic what we did in unit12 prelive
#plot_model(model, type = "pred", terms = c("age", "smoke"))

Get eigen vectors and values for principle component analysis

glow_bonemed$fracscore = as.integer(glow_bonemed$fracscore)
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
pc.result<-prcomp(glow_bonemed[, corr_vars],scale.=TRUE)

#Eigen Vectors
pc.result$rotation
##                  PC1         PC2         PC3         PC4          PC5
## age        0.4947219  0.46742140 -0.15246583  0.71654567 -0.009160237
## weight    -0.5273035  0.46578775 -0.08840991  0.03240244 -0.704362523
## height    -0.2345770 -0.08196149 -0.93823245  0.01885633  0.240042129
## bmi       -0.4741030  0.51615173  0.24533677  0.05137380  0.667820563
## fracscore  0.4442985  0.53984137 -0.16872342 -0.69463484  0.013601399
#Eigen Values
eigenvals<-pc.result$sdev^2
eigenvals
## [1] 2.374116591 1.523507236 0.975710317 0.123469514 0.003196342

Scree plot

par(mfrow = c(1, 2))

plot(eigenvals,type = "l",
     main = "Scree Plot",
     ylab = "Eigen Values",
     xlab = "PC #")

plot(eigenvals / sum(eigenvals), 
     type = "l", main = "Scree Plot", 
     ylab = "Prop. Var. Explained", 
     xlab = "PC #", 
     ylim = c(0, 1))

cumulative.prop = cumsum(eigenvals / sum(eigenvals))

lines(cumulative.prop, lty = 2)

Naive Bayes model with categorical variables

corr_vars <- c("priorfrac", "premeno", "momfrac", "armassist", "smoke", "raterisk", "fracture", "bonemed", "bonemed_fu", "bonetreat")

set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
nb.fit = train(fracture ~ .,
               data = trainingDataframe[, corr_vars],
               method = "nb",
               trControl = fitControl
               )
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
nb.fit
## Naive Bayes 
## 
## 400 samples
##   9 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa   
##   FALSE      0.6731191  0.163332
##    TRUE      0.7400891  0.000000
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
##  = 1.

Knn models

corr_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")

set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit = train(fracture ~ age,
               data = trainingDataframe[, corr_vars],
               method = "knn",
               trControl = fitControl,
               tuneGrid = expand.grid(k = c(1:10, 20, 30))
               )
knn.fit
## k-Nearest Neighbors 
## 
## 400 samples
##   1 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa      
##    1  0.7178299   0.04451770
##    2  0.7252048   0.04095402
##    3  0.7302720   0.05828196
##    4  0.7202689   0.03291690
##    5  0.7327720   0.05605753
##    6  0.7328971   0.06388898
##    7  0.7454644   0.08867516
##    8  0.7428361   0.07440775
##    9  0.7454644   0.07173952
##   10  0.7276438  -0.01393495
##   20  0.7299578   0.01388032
##   30  0.7302079  -0.00185982
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

After tinkering it seems the best model only uses the age variable

knnfit.predprobs.valid = predict(knn.fit, testingDataframe, type = "prob")[,"Yes"]
knnfit.roc.valid = roc(response = testingDataframe$fracture, predictor = knnfit.predprobs.valid, levels = c("No","Yes"))
## Setting direction: controls < cases
plot(knnfit.roc.valid, print.thres = "best", col = "lightblue", main = "Best threshold for Knn validation data set")

auc(knnfit.roc.valid)
## Area under the curve: 0.6389
corr_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")

set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit = train(fracture ~ age,
               data = trainingDataframe[, corr_vars],
               method = "knn",
               trControl = fitControl,
               tuneGrid = expand.grid(k = c(1:10, 20, 30))
               )
knn.fit
## k-Nearest Neighbors 
## 
## 400 samples
##   1 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa      
##    1  0.7178299   0.04451770
##    2  0.7252048   0.04095402
##    3  0.7302720   0.05828196
##    4  0.7202689   0.03291690
##    5  0.7327720   0.05605753
##    6  0.7328971   0.06388898
##    7  0.7454644   0.08867516
##    8  0.7428361   0.07440775
##    9  0.7454644   0.07173952
##   10  0.7276438  -0.01393495
##   20  0.7299578   0.01388032
##   30  0.7302079  -0.00185982
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
forestModel = randomForest(fracture ~ priorfrac + age + premeno + momfrac + armassist + smoke + fracscore + bonemed + bonemed_fu, 
  data = trainingDataframe, 
  ntree = 500, 
  mtry = 3, 
  imprtance = TRUE)

forestPredictions = predict(forestModel, testingDataframe)

confusionMatrix(forestPredictions, testingDataframe$fracture)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  74  15
##        Yes  5   6
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7082, 0.8733)
##     No Information Rate : 0.79            
##     P-Value [Acc > NIR] : 0.46055         
##                                           
##                   Kappa : 0.2695          
##                                           
##  Mcnemar's Test P-Value : 0.04417         
##                                           
##             Sensitivity : 0.9367          
##             Specificity : 0.2857          
##          Pos Pred Value : 0.8315          
##          Neg Pred Value : 0.5455          
##              Prevalence : 0.7900          
##          Detection Rate : 0.7400          
##    Detection Prevalence : 0.8900          
##       Balanced Accuracy : 0.6112          
##                                           
##        'Positive' Class : No              
## 
forestfit.predprobs.valid = predict(forestModel, testingDataframe, type = "prob")[,"Yes"]
forestfit.roc.valid = roc(response = testingDataframe$fracture, predictor = forestfit.predprobs.valid, levels = c("No","Yes"))
## Setting direction: controls < cases
plot(forestfit.roc.valid, print.thres = "best", col = "black", main = "Best threshold for random forest validation data set")

auc(forestfit.roc.valid)
## Area under the curve: 0.711
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1, classProbs = TRUE, summaryFunction = mnLogLoss)

set.seed(4)

trainingDataframe = trainingDataframe[, !(names(trainingDataframe) %in% c("site_id_factor", "site_id"))]
testingDataframe = testingDataframe[, !(names(testingDataframe) %in% c("site_id_factor", "site_id"))]

colnames(trainingDataframe)[which.min(apply(trainingDataframe, 2, var))]
## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion

## Warning in stats::var(...): NAs introduced by coercion
## [1] "fracscore"

{r} # #Version 1 # lda.fit = caret::train(fracture ~ . - sub_id - phy_id + priorfrac:fracscore + age:fracscore + fracscore:bonetreat, # data = trainingDataframe, # method = "lda", # trControl = fitControl, # preProc = c("center", "scale"), # metric = "logLoss") # # lda.fit # logLoss = 0.5702 # # #Computing predicted probabilities on the testing data # ldafit.predprobs.valid = predict(lda.fit, testingDataframe, type = "prob")[,"Yes"] # # # ldafit.roc.valid = roc(response = testingDataframe$fracture, predictor = ldafit.predprobs.valid, levels = c("No", "Yes")) #

{r} # complex1 = glm(fracture ~ age + bonetreat + fracscore + priorfrac + bonemed + bonemed_fu + priorfrac:fracscore + age:fracscore + fracscore:bonetreat, data = trainingDataframe, family = "binomial") # # # Get Predictions # #Complex model from previous # complex1.predprobs<-predict(complex1,trainingDataframe ,type="response") # # # complex model ROC # complex1.roc = roc(response=trainingDataframe$fracture,predictor=complex1.predprobs,levels=c("No","Yes")) # # validateComplexPred <- predict(complex1, newdata = testingDataframe, type="response") # # # complex model ROC # complex1.roc.Valid <- roc(response=testingDataframe$fracture,predictor=validateComplexPred,levels=c("No","Yes")) #

{r} # # Plot complex, lda models, and random forest # # #plot(complex1.roc.Valid,col="red") # # #plot(complex1.roc.Valid,print.thres="best",col="red") # use to print best threshold - gets messy with everything else # # #plot(ldafit.roc.valid, col="lightblue", add = T) # # # plot(ldafit.roc, col="lightblue", add = T, legend = T, print.thres="best") # use to print best threshold # # # #INSERT KNN MODEL ROC HERE SKELETON CODE MOST LIKELY: # plot.new() # plot(0,0, xlim=c(0,1), ylim=c(0,1), type="n", main="ROC curve") # # plot(knnfit.roc.valid, add = T, col = "green") # plot(complex1.roc.Valid,col="red") # plot(forestfit.roc.valid, add = T, col = "black", legend = T) # # # plot(rf.result.roc,print.thres="best", add = T, col = "black", legend = T) # # # # use to print best threshold # legend("bottomright", # legend=c("Complex", "LDA", "KNN", "Random Forest"), # col=c("red", "lightblue", "green", "black"), # lwd=4, cex =1, xpd = TRUE, horiz = FALSE) #

#Sources:
Hosmer, D.W., Lemeshow, S. and Sturdivant, R.X. (2013) Applied Logistic Regression, 3rd ed., New York: Wiley

https://cran.r-project.org/web/packages/aplore3/aplore3.pdf#page=11&zoom=100,132,90